/*******************************************************************************

This code file cleans the DOB permitting database for new buildings.

Prerequisite command: a2group 
Can be obtained by: ssc install a2group

*******************************************************************************/

*** Manage settings

	run "~/Dropbox (MIT)/Research/NYC421a/code/modules/settings.do"
	
	set more off
	clear
	
*** Import data
	
	import delimited "$data/raw/DOB_Permit_Issuance.csv", encoding(ISO-8859-1)
	
*** Define universe of relevant permits

	* Keep only New Building (NB) job type
	keep if jobtype == "NB"
	
	* Keep only initial permits
	keep if filingstatus == "INITIAL"
	
	* Clean issuance date (for universe definition and analysis)
	gen issuance = clock(issuancedate,"MDY hms")
	gen issuance_date = dofc(issuance)
	gen issuance_year = year(issuance_date)
	drop issuance
	
	* Keep first issuance
	bys bin: gegen min_issuance_date = min(issuance_date)
	keep if issuance_date == min_issuance_date
	
	gen min_issuance_year = year(min_issuance_date)
	
	* Restrict permit data to relevant years
	keep if min_issuance_year >= 2003 & min_issuance_year <= 2015	
	
	drop min_issuance_year

*** Clean data

	* Destring BIN
	destring bin, replace

	* Recode borough
	
		gen borocode = .
		replace borocode = 1 if borough == "MANHATTAN"
		replace borocode = 2 if borough == "BRONX"
		replace borocode = 3 if borough == "BROOKLYN"
		replace borocode = 4 if borough == "QUEENS"
		replace borocode = 5 if borough == "STATEN ISLAND"
		
		drop borough
		rename borocode borough
		
		label define borough_label 1 "Manhattan" 2 "Bronx" 3 "Brooklyn" 4 "Queens" 5 "Staten Island"
		label values borough borough_label
	
	* Create BBL codes
		
		* Fix stray typos in block/lot codes (note: first fixes a hidden-character issue)
		replace block = "06534" if block == "O6534"
		replace lot = "28" if lot == "...28"
		
		* Destring block lot, now that it is repaired
		destring block lot, replace

		* Drop if missing component of BBL code
		drop if missing(borough) | missing(block) | missing(lot)

		tostring borough, gen(boro)
		tostring block, gen(block_)
		tostring lot, gen(lot_)

		replace block_ = "0"+block_ if length(block_)<5
		replace block_ = "0"+block_ if length(block_)<5
		replace block_ = "0"+block_ if length(block_)<5
		replace block_ = "0"+block_ if length(block_)<5

		replace lot_ = "0"+lot_ if length(lot_)<4
		replace lot_ = "0"+lot_ if length(lot_)<4
		replace lot_ = "0"+lot_ if length(lot_)<4

		gen bbl = boro+block_+lot_
		destring bbl, replace

		drop block_ lot_ boro

		format bbl %18.0f
		
	* Detect prominent builders
		
		destring permitteesphone, replace force
		destring permitteeslicense, replace
		
		preserve
		keep if missing(permitteesphone) | missing(permitteeslicense)
		tempfile tmp_append
		save `tmp_append', replace
		restore
		
		keep if !missing(permitteesphone) & !missing(permitteeslicense)
		a2group, individual(permitteesphone) unit(permitteeslicense) groupvar(permittee_id)
		
		append using `tmp_append'
		
		egen permittee_id_alt = group(permitteeslicense) if missing(permittee_id), missing
		
		summ permittee_id
		local max = r(max)
		
		replace permittee_id = permittee_id_alt + `max' if missing(permittee_id)
		drop permittee_id_alt
		
		bys permittee_id: egen permitteeslicense_ = mode(permitteeslicense)
		bys permittee_id: egen permitteesbusinessname_ = mode(permitteesbusinessname)
		
		replace permittee_id = permitteeslicense_
		drop permitteeslicense_ permitteesbusinessname
		rename permitteesbusinessname_ permitteesbusinessname
		
	* Find modal address info on permit by BIN
		
	bys bin: egen bbl_permit = mode(bbl), minmode
	bys bin: egen house_ = mode(house), minmode
	bys bin: egen streetname_ = mode(streetname), minmode
	bys bin: egen permittee_id_ = mode(permittee_id), minmode
	bys bin: egen permitteesbusinessname_ = mode(permitteesbusinessname), minmode
	
	format permittee_id_ %18.0f
	
*** Merge into PAD BIN-to-BBL crosswalk

	preserve
	
	keep bin bbl_permit issuance_date house_ streetname_ permittee_id_ permitteesbusinessname_
	duplicates drop bin bbl_permit issuance_date house_ streetname_, force
	
	rename house_ house
	rename streetname_ streetname
	rename permittee_id_ permittee_id
	rename permitteesbusinessname_ permitteesbusinessname

	* Save unmatched BINs
	*      These are BINs that cannot be matched using the PAD BIN-to-BBL crosswalk. 
	*      We clean their address data and save them to raw/unmatched_permits.dta, for later matching on address in clean_pluto.do
		
		merge 1:1 bin using "$data/clean/bin_to_bbl.dta", nogen keep(1)
		
		drop bbl
		
		format bbl_permit %18.0f
		format issuance_date %td
		
		tostring bbl_permit, replace
		gen borough = substr(bbl_permit,1,1)
		gen block = substr(bbl_permit,2,5)
		destring borough block, replace
		drop bbl_permit
		
		* Clean address
			
			* Fix internal spaces
			replace streetname = stritrim(streetname)
			
			* Regex to drop "ST"/"ND"/"RD"/"TH"(e.g., 9TH STREET -> 9 STREET)
			replace streetname = regexr(streetname,"1ST","1")
			replace streetname = regexr(streetname,"2ND","2")
			replace streetname = regexr(streetname,"3RD","3")
			replace streetname = regexr(streetname,"2TH","2")
			replace streetname = regexr(streetname,"3TH","3")
			replace streetname = regexr(streetname,"4TH","4")
			replace streetname = regexr(streetname,"5TH","5")
			replace streetname = regexr(streetname,"6TH","6")
			replace streetname = regexr(streetname,"7TH","7")
			replace streetname = regexr(streetname,"8TH","8")
			replace streetname = regexr(streetname,"9TH","9")
			replace streetname = regexr(streetname,"0TH","0")
			
			* Replace full string numbers for avenues (e.g., FIRST AVENUE --> 1 AVENUE)
			replace streetname = regexr(streetname,"FIRST","1")
			replace streetname = regexr(streetname,"THIRD","3")
			replace streetname = regexr(streetname,"EIGHTH","8")
			replace streetname = regexr(streetname,"NINTH","9")
			
			* Regex to replace contractions of street suffixes (e.g., "GREENWICH ST" -> "GREENWICH STREET")
			replace streetname = regexr(streetname," ST$"," STREET")
			replace streetname = regexr(streetname," ST.$"," STREET")
			replace streetname = regexr(streetname," RD$"," ROAD")
			replace streetname = regexr(streetname," RD.$"," ROAD")
			replace streetname = regexr(streetname," AVE$"," AVENUE")
			replace streetname = regexr(streetname," AVE.$"," AVENUE")
			replace streetname = regexr(streetname," BLVD$"," BOULEVARD")
			replace streetname = regexr(streetname," BLVD.$"," BOULEVARD")
			
			* Regex for replace contractions of "east" and "west" in street names
			replace streetname = regexr(streetname,"^E ","EAST ")
			replace streetname = regexr(streetname,"^W ","WEST ")
			replace streetname = regexr(streetname,"^E. ","EAST ")
			replace streetname = regexr(streetname,"^W. ","WEST ")
			
			* Attach house number to street name
			gen address = house + " " + streetname
			drop house streetname
			
			* Address issue w/ two observations: same borough-block-address was assigned two BINs
			bys borough block address: egen min_issuance_date = min(issuance_date)
			keep if issuance_date == min_issuance_date
			drop min_issuance_date
			
		order bin borough block address issuance_date
		save "$data/raw/unmatched_permits.dta", replace
		
	* Return to matches
	
	restore
		
		keep bin bbl_permit issuance_date permittee_id permitteesbusinessname
		duplicates drop bin bbl_permit issuance_date, force
		
		merge 1:1 bin using "$data/clean/bin_to_bbl.dta", nogen keep(3)
		
		replace bbl = bbl_permit if missing(bbl)
		drop bbl_permit
		
		format issuance_date %td
		order bin bbl issuance_date
	
** Merge in scraped 421-a status 
** Note: Merged here for only permits matched on BIN-to-BBL crosswalk, will address unmatched permits in pluto.do

	merge m:1 bbl using "$data/clean/assessments_cleaned.dta", nogen
		
	* Collapse to BBLs 
		* Note: I use the earliest issuance date by BBL across BINs to address multi-stage construction of condos
	collapse (max) flag* (min) issuance_date (firstnm) permittee_id permitteesbusinessname, by(bbl)
	
	* Save to tempfile
	tempfile tmp
	save `tmp', replace
	
** Using PAD, map (many-to-one) from unit BBLs to condo BBLs

	* Merge into condo BBL crosswalk
	merge m:1 bbl using "$data/clean/xwalk_condo_bbl.dta", nogen keep(3)
	
	* Collapse to condo BBLs
	collapse (max) flag* (min) issuance_date (firstnm) permittee_id, by(bbl_condo)
	keep if !missing(bbl_condo)
	rename bbl_condo bbl
	
	* Reappend to full dataset
	append using `tmp'
	
	collapse (max) flag* (min) issuance_date (firstnm) permittee_id permitteesbusinessname, by(bbl)
	drop if missing(issuance_date)
	
*** Save
	
	save "$data/clean/permits_cleaned.dta", replace
